This work relies on processed data from Kaggle https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store
This work is motivated by the publication https://arxiv.org/pdf/2102.01625.pdf
Further details on data processing available at: https://arxiv.org/pdf/2010.02503.pdf
Business Problem
Tree Based models
decision trees
non-linear
# Display the illustrative figures for the business problem.
# NOTE(review): in a notebook each cell renders its Image; as a plain
# script these expressions produce no visible output.
from IPython.display import Image
Image(filename='image1.png')
Image(filename='image10.png')
## Importing required Libraries
import os
import numpy as np
import pandas as pd
import seaborn as sns
### START CODE HERE ###
# Load the pre-processed training features and take a quick look.
X_train = pd.read_csv('X_train.csv')
print(X_train.shape)
X_train.head()
### END CODE HERE ###
### START CODE HERE ###
# Identify the non-numeric (object-dtype) columns that will need encoding.
nonNumFeat = X_train.select_dtypes(include=object).columns.tolist()
nonNumFeat
### END CODE HERE ###
### START CODE HERE ###
# Inspect the categorical levels before mapping them to integers.
X_train['weekday'].unique()
### END CODE HERE ###
X_train['timeOfDay'].unique()
Remember, these feature values are ordered temporally
# Encode the day names as ordered integers (Mon=1 ... Sun=7).
# Note the data uses the abbreviation 'Fr' (not 'Fri') for Friday.
weekday_codes = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4,
                 'Fr': 5, 'Sat': 6, 'Sun': 7}
X_train['weekday'] = X_train['weekday'].replace(weekday_codes)
### START CODE HERE ###
# Encode the time-of-day buckets in temporal order (Dawn=1 ... Night=7).
timeofday_codes = {'Dawn': 1, 'EarlyMorning': 2, 'Morning': 3, 'Noon': 4,
                   'Afternoon': 5, 'Evening': 6, 'Night': 7}
X_train['timeOfDay'] = X_train['timeOfDay'].replace(timeofday_codes)
### END CODE HERE ###
print(X_train.shape)
X_train.head()
### START CODE HERE ###
# Load the pre-processed test features.
X_test = pd.read_csv('X_test.csv')
print(X_test.shape)
X_test.head()
### END CODE HERE ###
### START CODE HERE ###
# Apply the same ordinal encodings used on the training set.
X_test['weekday'] = X_test['weekday'].replace(
    {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fr': 5, 'Sat': 6, 'Sun': 7}
)
X_test['timeOfDay'] = X_test['timeOfDay'].replace(
    {'Dawn': 1, 'EarlyMorning': 2, 'Morning': 3, 'Noon': 4,
     'Afternoon': 5, 'Evening': 6, 'Night': 7}
)
### END CODE HERE ###
print(X_test.shape)
X_test.head()
### START CODE HERE ###
# The binary 'Purchase' columns are the prediction targets.
y_train = X_train['Purchase'].values
y_test = X_test['Purchase'].values
# Report the class balance: purchases are a small minority class, which
# motivates the imbalance-handling experiments further below.
train_pct = (np.sum(y_train) * 100) / len(y_train)
test_pct = (np.sum(y_test) * 100) / len(y_test)
print(f'Percentage of purchases in training = {train_pct}')
print(f'Percentage of purchases in test = {test_pct}')
### END CODE HERE ###
# Feature-selection and plotting utilities for the sections below.
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns  # NOTE(review): seaborn was already imported above
print(f'The training data currently has {X_train.shape[1]} features')
#Visual inspection of features
plt.figure(figsize=(12,10))
### START CODE HERE ###
# Compute the Pearson Correlation of each feature in the training df
cor = X_train.corr()
### END CODE HERE ###
# Plot the correlations
sns.heatmap(cor)
plt.show()
# Some interesting correlations pop up that need further analysis (later)
# we need to eliminate features that have very high absolute correlations
### START CODE HERE ###
# NaN correlations arise from constant (zero-variance) columns; filling
# them with 1.0 marks those columns as perfectly correlated so the
# filter below will drop the redundant ones instead of skipping them.
cor = cor.fillna(1.0)
### END CODE HERE ###
### START CODE HERE ###
# Greedy correlation filter: walk the upper triangle of the correlation
# matrix and drop the right-hand column of every pair whose absolute
# Pearson correlation is >= 0.8.  The j-range stops one short of the
# last column ('Purchase', the target), so the target can never be
# dropped by this filter.
keep_columns = [True] * len(cor.columns)
# Loop over all columns that can appear as the left member of a pair
for i in range(len(keep_columns) - 2):
    # Loop over all columns to the right of the current one (target excluded)
    for j in range(i + 1, len(keep_columns) - 1):
        # NaNs were filled with 1.0 above, so constant columns are
        # treated as perfectly correlated and removed here as well.
        # (Setting an already-False flag again is harmless, so no guard
        # is needed before the assignment.)
        if abs(cor.iloc[i, j]) >= 0.8:
            keep_columns[j] = False
# Extract the surviving column names, preserving their original order
selected_columns = [c for c, k in zip(cor.columns.tolist(), keep_columns) if k]
# Make a new df with the columns we've decided to keep from the training df
X_out = X_train[selected_columns]
### END CODE HERE ###
print('The following columns are present in the new data:')
print(selected_columns)
print(f'The old data had {X_train.shape[1]} features. The new df has {X_out.shape[1]} features.')
print("Shape of y:", y_train.shape)
X_out.head()
# NOTE(review): slicing with [0:] just makes a shallow copy of the full
# list — this line is effectively a no-op and could be removed.
selected_columns = selected_columns[0:]
import statsmodels.api as sm
def backwardElimination(x, y, sl, columns):
    """Iteratively drop the feature with the highest OLS p-value.

    Repeatedly fits an OLS model of ``y`` on ``x`` and, while the largest
    coefficient p-value exceeds the significance level ``sl``, removes the
    corresponding column from both ``x`` and ``columns``.

    Args:
        x: 2-D NumPy array of feature values (columns = features).
        y: 1-D target array.
        sl: significance level; features with p-value > sl are eliminated.
        columns: array-like of names aligned with the columns of ``x``.

    Returns:
        Tuple ``(x, columns)`` with the insignificant columns removed.
    """
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues)
        if maxVar <= sl:
            # Every remaining feature is significant -- stop refitting.
            # (The original kept looping and refitting to no effect.)
            break
        for j in range(0, numVars - i):
            if regressor_OLS.pvalues[j] == maxVar:
                # Drop the offending column from the data and the names
                x = np.delete(x, j, 1)
                columns = np.delete(columns, j)
                # Remove exactly one column per fit; without this break,
                # tied p-values would delete extra, index-shifted columns.
                break
    return x, columns
# Significance level for the backward-elimination step below
SL = 0.01
# Note that we're passing in df_out, not df_train
# NOTE(review): X_out.values still contains the 'Purchase' target column,
# so the target is included as a regressor here — its p-value will be ~0
# and it always survives elimination. Verify this is intended.
data_modeled, selected_columns = backwardElimination(X_out.values, y_train, SL, selected_columns)
print('The following columns remain based on p-value selection:')
print(selected_columns)
# Make a reduced df
data_red = pd.DataFrame(data = data_modeled, columns = selected_columns)
print(f'After selection by Pearson Correlation, we had {X_out.shape[1]} features.')
print(f'After selection by p-value, we have {data_red.shape[1]} features.')
Ignore the warning messages about distplot. I should replace it with displot, but I haven't figured out how to overlay plots with it.
import warnings
warnings.filterwarnings('ignore')
# Overlay the per-class distribution of every selected feature
# (green = no purchase, red = purchase).
# seaborn's distplot() is deprecated/removed; histplot(kde=True,
# stat='density') is the documented replacement and still overlays
# histograms on the same axes.
fig = plt.figure(figsize = (20, 15))
for j, col in enumerate(data_red.columns):
    plt.subplot(3, 4, j + 1)
    sns.histplot(data_red[col][data_red['Purchase']==0], color='g',
                 label='no', kde=True, stat='density')
    sns.histplot(data_red[col][data_red['Purchase']==1], color='r',
                 label='yes', kde=True, stat='density')
    plt.legend(loc='best')
fig.suptitle('Subscription Feature Analysis')
fig.tight_layout()
fig.subplots_adjust(top=0.95)
plt.show()
from sklearn.preprocessing import normalize
# Note that this function outputs NumPy arrays by default
### START CODE HERE ###
# Max-abs scale each feature column.  The scaling factors are computed on
# the TRAINING data only and re-used for the test set: scaling each split
# by its own column maxima (as calling normalize() per split would do)
# makes the two sets inconsistent and leaks test-set information.
X_train = X_out.drop(columns='Purchase')
train_max = np.abs(X_train.to_numpy(dtype=float)).max(axis=0)
train_max[train_max == 0] = 1.0  # guard all-zero columns against /0
X_train = X_train.to_numpy(dtype=float) / train_max
y_train = y_train
### END CODE HERE ###
### START CODE HERE ###
# Same Pearson-selected columns, scaled with the TRAIN maxima
X_test = X_test.loc[:, keep_columns].drop(columns='Purchase')
X_test = X_test.to_numpy(dtype=float) / train_max
y_test = y_test
### END CODE HERE ###
print(X_train.shape)
print(X_test.shape)
#Use Random Forest to get feature ranks/importances for each feature
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
# Build a forest and compute the impurity-based feature importances
forest = ExtraTreesClassifier(n_estimators=20, random_state=0)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
# Spread of the importance estimates across the individual trees
std = np.std([est.feature_importances_ for est in forest.estimators_], axis=0)
# Feature indices ordered from most to least important
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, idx, importances[idx]))
# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.xlim([-1, X_train.shape[1]])
plt.show()
X_out.columns
Note that X, y, and yhat must be NumPy arrays for this function to work
# This function visualizes the classification output on scatter plot
# Feature 1 (Interaction Time is used to observe the impact of low ranked feature)
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
def visualize_scatter_plot(X, y, yhat, title):
    """Plot a 3-D scatter of the four confusion-matrix cells.

    Args:
        X: 2-D NumPy feature array; columns 3, 4 and 1 are plotted on
           the x, y and z axes (labelled NumCart, NumViews,
           InteractionTime — assumes that column ordering; verify).
        y: true binary labels as a NumPy array.
        yhat: predicted labels as a NumPy array.
        title: figure title string.
    """
    # 1/0 indicator vectors marking each confusion-matrix cell
    loc00=np.where((y==0) & (yhat==0),1,0) # TN: blue
    loc10=np.where((y>0) & (yhat==0),1,0) # FN: cyan
    loc11=np.where((y>0) & (yhat>0),1,0) # TP: red
    loc01=np.where((y==0) & (yhat>0),1,0) # FP: green
    fig = plt.figure(figsize = (10, 7))
    ax = plt.axes(projection ="3d")
    # One scatter series per cell, colour/marker coded as noted above
    ax.scatter3D(X[np.where(loc00>0),3],X[np.where(loc00>0),4],X[np.where(loc00>0),1],color='blue', marker='o')
    ax.scatter3D(X[np.where(loc10>0),3],X[np.where(loc10>0),4],X[np.where(loc10>0),1],color='cyan', marker='o')
    ax.scatter3D(X[np.where(loc11>0),3],X[np.where(loc11>0),4],X[np.where(loc11>0),1],color='red', marker='^')
    ax.scatter3D(X[np.where(loc01>0),3],X[np.where(loc01>0),4],X[np.where(loc01>0),1],color='green', marker='^')
    ax.set_xlabel('NumCart')
    ax.set_ylabel('NumViews')
    ax.set_zlabel('InteractionTime')
    plt.title(title)
    plt.show()
Ensure each model is fitted with the best hyperparameters. That includes mitigating the effects of any class imbalances.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
Remember to use the test data when making and evaluating the predictions
### START CODE HERE ###
# Logistic regression with class_weight='balanced' to counter the heavy
# class imbalance (purchases are rare).  A previously tried variant that
# standardized the features first was left commented out along with an
# unused StandardScaler instance; that dead code has been removed.
regb = LogisticRegression(class_weight='balanced', random_state=0).fit(X_train, y_train)
reg_predb = regb.predict(X_test)
### END CODE HERE ###
### START CODE HERE ###
# Evaluate the balanced logistic-regression predictions on the test set
cmlog = confusion_matrix(y_test, reg_predb)
acc = accuracy(y_test, reg_predb)
rec = recall(y_test, reg_predb)
prec = precision(y_test, reg_predb)
f1 = f1_score(y_test, reg_predb)
### END CODE HERE ###
# Print the metrics, display the confusion matrix, and visualize the model
print(f'Accuracy = {acc:.3f}, Precision = {prec:.3f}, Recall = {rec:.3f}, F1-score = {f1:.3f}')
print('Confusion Matrix is:')
print(cmlog)
visualize_scatter_plot(X_test,y_test,reg_predb,'Logistic Regression Balanced')
#No improvement
# NOTE(review): "no improvement" presumably compares against an
# unweighted logistic-regression baseline not shown here — confirm.
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
### START CODE HERE ###
# Fit a gradient-boosted tree ensemble (default hyper-parameters) and
# predict the held-out test set.
# NOTE: despite the nn_* names, this is GradientBoostingClassifier,
# not the imported MLPClassifier.
nn_model = GradientBoostingClassifier()
nn_model.fit(X_train, y_train)
nn_pred = nn_model.predict(X_test)
### END CODE HERE ###
### START CODE HERE ###
# Test-set evaluation: confusion matrix plus the four summary metrics
cmnn = confusion_matrix(y_test, nn_pred)
acc, rec, prec, f1 = (fn(y_test, nn_pred)
                      for fn in (accuracy, recall, precision, f1_score))
### END CODE HERE ###
# Print the metrics, display the confusion matrix, and visualize the model
print(f'Accuracy = {acc:.3f}, Precision = {prec:.3f}, Recall = {rec:.3f}, F1-score = {f1:.3f}')
print('Confusion Matrix is:')
print(cmnn)
visualize_scatter_plot(X_test,y_test,nn_pred,'NN Model')
### START CODE HERE ###
from sklearn.ensemble import RandomForestClassifier
# Random-forest baseline with NO class-imbalance handling.
# Instantiate with max depth 5 and random state 0, train, then predict
# the test features.
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
# Confusion matrix and the four summary metrics on the test set
cm = confusion_matrix(y_test, prediction)
acc, rec, prec, f1 = (fn(y_test, prediction)
                      for fn in (accuracy, recall, precision, f1_score))
### END CODE HERE ###
# Print the metrics, display the confusion matrix, and visualize the model
print(f'Accuracy = {acc}, Precision = {prec}, Recall = {rec}, F1-score = {f1}')
print('Confusion Matrix is:')
print(cm)
visualize_scatter_plot(X_test, y_test, prediction, 'Random_forest_Unbalanced')
### START CODE HERE ###
# Same forest (max depth 5, random state 0), but class_weight='balanced'
# gives the rare purchase class proportionally more weight in training.
clf = RandomForestClassifier(max_depth=5, random_state=0, class_weight='balanced')
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
# Confusion matrix and the four summary metrics on the test set
cm = confusion_matrix(y_test, prediction)
acc, rec, prec, f1 = (fn(y_test, prediction)
                      for fn in (accuracy, recall, precision, f1_score))
### END CODE HERE ###
# Print the metrics, display the confusion matrix, and visualize the model
print(f'Accuracy = {acc}, Precision = {prec}, Recall = {rec}, F1-score = {f1}')
print('Confusion Matrix is:')
print(cm)
visualize_scatter_plot(X_test,y_test,prediction,'Random_forest_balanced_cost')
DON'T try running this in Colab. You'll get the following error message: ValueError: object of too small depth for desired array. It does seem to work when run locally, though.
# Random under-sampling: keep every positive (purchase) example plus an
# equal-sized random subset of the negatives.
yOneIdx = np.arange(0, len(y_train), dtype=np.int64)[y_train == 1]
# replace=False draws DISTINCT negative rows.  The original call used
# np.random.choice's default (replace=True), so duplicate negatives
# could appear and fewer unique majority-class rows be represented.
yZeroIdx = np.random.choice(np.arange(0, len(y_train), dtype=np.int64)[y_train == 0],
                            len(yOneIdx), replace=False)
assert len(yOneIdx) == len(yZeroIdx)
sampleIdx = np.concatenate((yOneIdx, yZeroIdx))
from imblearn.ensemble import BalancedRandomForestClassifier
### START CODE HERE ###
# Fit a balanced random forest (max depth 5, random state 0) on the
# under-sampled training subset, then predict the test set.
clf = BalancedRandomForestClassifier(max_depth=5, random_state=0)
clf.fit(X_train[sampleIdx], y_train[sampleIdx])
predictionBR = clf.predict(X_test)
# Confusion matrix and the four summary metrics on the test set
cm = confusion_matrix(y_test, predictionBR)
acc, rec, prec, f1 = (fn(y_test, predictionBR)
                      for fn in (accuracy, recall, precision, f1_score))
### END CODE HERE ###
# Print the metrics, display the confusion matrix, and visualize the model
print(f'Accuracy = {acc}, Precision = {prec}, Recall = {rec}, F1-score = {f1}')
print('Confusion Matrix is:')
print(cm)
visualize_scatter_plot(X_test, y_test, predictionBR, 'Random_forest_balanced_trees')
# Feature names for the tree plot: drop the final entry, which is the
# 'Purchase' target column retained by the p-value selection step.
# NOTE(review): the forest was trained on the Pearson-selected features
# (X_out minus 'Purchase'); this assumes the p-value step dropped no
# columns, otherwise the name count would not match — verify.
cols = np.array((selected_columns[:-1])).astype(str)
print(cols)
# Inspect the first tree of the fitted (balanced) forest
print(clf.estimators_[0])
from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(
    clf.estimators_[0],
    max_depth=5,
    out_file='tree.dot',
    feature_names = cols,
    rounded = True,
    proportion = False,
    precision = 2,
    filled = True
)
# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')
| Classifier | Accuracy | Precision | Recall | F1 Score |
|---|---|---|---|---|
| Logistic Regression | 0.999 | 0.944 | 0.990 | 0.967 |
| Gradient Boosting | 0.999 | 0.954 | 0.982 | 0.968 |
I would report Logistic regression as the best model and NumOfEventsInJourney as the best feature. Logistic regression gives a higher recall score, and we would like to capture as many purchases as possible. Meanwhile, based on the tree generated by the decision-tree models, NumOfEventsInJourney is the first feature used for splitting.